Setup

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)
library(tidytext)
library(caret)
library(fastDummies)
library(randomForest)
library(tidymodels)
library(factoextra)
library(cluster)
library(plotly)
library(skimr)



# https://www.openml.org/d/1590

raw_income = read_csv("https://raw.githubusercontent.com/chanks06/ml-model3/main/datasets/openml_1590.csv", na=c("?"))

income = read_csv("https://raw.githubusercontent.com/chanks06/ml-model3/main/datasets/openml_1590.csv", na=c("?")) %>%
  drop_na() %>%
  mutate(income_above_50k = class==">50K") %>%
  select(-class) %>%
  dummy_cols(remove_selected_columns = T)
skim(income)
Data summary
Name income
Number of rows 45222
Number of columns 105
_______________________
Column type frequency:
logical 1
numeric 104
________________________
Group variables None

Variable type: logical

skim_variable n_missing complete_rate mean count
income_above_50k 0 1 0.25 FAL: 34014, TRU: 11208

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 38.55 13.22 17 28.0 37 47 90 ▇▇▅▁▁
fnlwgt 0 1 189734.73 105639.20 13492 117388.2 178316 237926 1490400 ▇▁▁▁▁
education-num 0 1 10.12 2.55 1 9.0 10 13 16 ▁▁▇▃▁
capital-gain 0 1 1101.43 7506.43 0 0.0 0 0 99999 ▇▁▁▁▁
capital-loss 0 1 88.60 404.96 0 0.0 0 0 4356 ▇▁▁▁▁
hours-per-week 0 1 40.94 12.01 1 40.0 40 45 99 ▁▇▃▁▁
workclass_Federal-gov 0 1 0.03 0.17 0 0.0 0 0 1 ▇▁▁▁▁
workclass_Local-gov 0 1 0.07 0.25 0 0.0 0 0 1 ▇▁▁▁▁
workclass_Private 0 1 0.74 0.44 0 0.0 1 1 1 ▃▁▁▁▇
workclass_Self-emp-inc 0 1 0.04 0.19 0 0.0 0 0 1 ▇▁▁▁▁
workclass_Self-emp-not-inc 0 1 0.08 0.28 0 0.0 0 0 1 ▇▁▁▁▁
workclass_State-gov 0 1 0.04 0.20 0 0.0 0 0 1 ▇▁▁▁▁
workclass_Without-pay 0 1 0.00 0.02 0 0.0 0 0 1 ▇▁▁▁▁
education_1st-4th 0 1 0.00 0.07 0 0.0 0 0 1 ▇▁▁▁▁
education_5th-6th 0 1 0.01 0.10 0 0.0 0 0 1 ▇▁▁▁▁
education_7th-8th 0 1 0.02 0.13 0 0.0 0 0 1 ▇▁▁▁▁
education_9th 0 1 0.01 0.12 0 0.0 0 0 1 ▇▁▁▁▁
education_10th 0 1 0.03 0.16 0 0.0 0 0 1 ▇▁▁▁▁
education_11th 0 1 0.04 0.19 0 0.0 0 0 1 ▇▁▁▁▁
education_12th 0 1 0.01 0.11 0 0.0 0 0 1 ▇▁▁▁▁
education_Assoc-acdm 0 1 0.03 0.18 0 0.0 0 0 1 ▇▁▁▁▁
education_Assoc-voc 0 1 0.04 0.20 0 0.0 0 0 1 ▇▁▁▁▁
education_Bachelors 0 1 0.17 0.37 0 0.0 0 0 1 ▇▁▁▁▂
education_Doctorate 0 1 0.01 0.11 0 0.0 0 0 1 ▇▁▁▁▁
education_HS-grad 0 1 0.33 0.47 0 0.0 0 1 1 ▇▁▁▁▃
education_Masters 0 1 0.06 0.23 0 0.0 0 0 1 ▇▁▁▁▁
education_Preschool 0 1 0.00 0.04 0 0.0 0 0 1 ▇▁▁▁▁
education_Prof-school 0 1 0.02 0.13 0 0.0 0 0 1 ▇▁▁▁▁
education_Some-college 0 1 0.22 0.41 0 0.0 0 0 1 ▇▁▁▁▂
marital-status_Divorced 0 1 0.14 0.35 0 0.0 0 0 1 ▇▁▁▁▁
marital-status_Married-AF-spouse 0 1 0.00 0.03 0 0.0 0 0 1 ▇▁▁▁▁
marital-status_Married-civ-spouse 0 1 0.47 0.50 0 0.0 0 1 1 ▇▁▁▁▇
marital-status_Married-spouse-absent 0 1 0.01 0.11 0 0.0 0 0 1 ▇▁▁▁▁
marital-status_Never-married 0 1 0.32 0.47 0 0.0 0 1 1 ▇▁▁▁▃
marital-status_Separated 0 1 0.03 0.17 0 0.0 0 0 1 ▇▁▁▁▁
marital-status_Widowed 0 1 0.03 0.17 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Adm-clerical 0 1 0.12 0.33 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Armed-Forces 0 1 0.00 0.02 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Craft-repair 0 1 0.13 0.34 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Exec-managerial 0 1 0.13 0.34 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Farming-fishing 0 1 0.03 0.18 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Handlers-cleaners 0 1 0.05 0.21 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Machine-op-inspct 0 1 0.07 0.25 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Other-service 0 1 0.11 0.31 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Priv-house-serv 0 1 0.01 0.07 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Prof-specialty 0 1 0.13 0.34 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Protective-serv 0 1 0.02 0.15 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Sales 0 1 0.12 0.32 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Tech-support 0 1 0.03 0.17 0 0.0 0 0 1 ▇▁▁▁▁
occupation_Transport-moving 0 1 0.05 0.22 0 0.0 0 0 1 ▇▁▁▁▁
relationship_Husband 0 1 0.41 0.49 0 0.0 0 1 1 ▇▁▁▁▆
relationship_Not-in-family 0 1 0.26 0.44 0 0.0 0 1 1 ▇▁▁▁▃
relationship_Other-relative 0 1 0.03 0.17 0 0.0 0 0 1 ▇▁▁▁▁
relationship_Own-child 0 1 0.15 0.35 0 0.0 0 0 1 ▇▁▁▁▂
relationship_Unmarried 0 1 0.11 0.31 0 0.0 0 0 1 ▇▁▁▁▁
relationship_Wife 0 1 0.05 0.21 0 0.0 0 0 1 ▇▁▁▁▁
race_Amer-Indian-Eskimo 0 1 0.01 0.10 0 0.0 0 0 1 ▇▁▁▁▁
race_Asian-Pac-Islander 0 1 0.03 0.17 0 0.0 0 0 1 ▇▁▁▁▁
race_Black 0 1 0.09 0.29 0 0.0 0 0 1 ▇▁▁▁▁
race_Other 0 1 0.01 0.09 0 0.0 0 0 1 ▇▁▁▁▁
race_White 0 1 0.86 0.35 0 1.0 1 1 1 ▁▁▁▁▇
sex_Female 0 1 0.32 0.47 0 0.0 0 1 1 ▇▁▁▁▃
sex_Male 0 1 0.68 0.47 0 0.0 1 1 1 ▃▁▁▁▇
native-country_Cambodia 0 1 0.00 0.02 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Canada 0 1 0.00 0.06 0 0.0 0 0 1 ▇▁▁▁▁
native-country_China 0 1 0.00 0.05 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Columbia 0 1 0.00 0.04 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Cuba 0 1 0.00 0.05 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Dominican-Republic 0 1 0.00 0.05 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Ecuador 0 1 0.00 0.03 0 0.0 0 0 1 ▇▁▁▁▁
native-country_El-Salvador 0 1 0.00 0.06 0 0.0 0 0 1 ▇▁▁▁▁
native-country_England 0 1 0.00 0.05 0 0.0 0 0 1 ▇▁▁▁▁
native-country_France 0 1 0.00 0.03 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Germany 0 1 0.00 0.07 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Greece 0 1 0.00 0.03 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Guatemala 0 1 0.00 0.04 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Haiti 0 1 0.00 0.04 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Holand-Netherlands 0 1 0.00 0.00 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Honduras 0 1 0.00 0.02 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Hong 0 1 0.00 0.02 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Hungary 0 1 0.00 0.02 0 0.0 0 0 1 ▇▁▁▁▁
native-country_India 0 1 0.00 0.06 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Iran 0 1 0.00 0.04 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Ireland 0 1 0.00 0.03 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Italy 0 1 0.00 0.05 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Jamaica 0 1 0.00 0.05 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Japan 0 1 0.00 0.04 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Laos 0 1 0.00 0.02 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Mexico 0 1 0.02 0.14 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Nicaragua 0 1 0.00 0.03 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Outlying-US(Guam-USVI-etc) 0 1 0.00 0.02 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Peru 0 1 0.00 0.03 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Philippines 0 1 0.01 0.08 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Poland 0 1 0.00 0.04 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Portugal 0 1 0.00 0.04 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Puerto-Rico 0 1 0.00 0.06 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Scotland 0 1 0.00 0.02 0 0.0 0 0 1 ▇▁▁▁▁
native-country_South 0 1 0.00 0.05 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Taiwan 0 1 0.00 0.03 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Thailand 0 1 0.00 0.03 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Trinadad&Tobago 0 1 0.00 0.02 0 0.0 0 0 1 ▇▁▁▁▁
native-country_United-States 0 1 0.91 0.28 0 1.0 1 1 1 ▁▁▁▁▇
native-country_Vietnam 0 1 0.00 0.04 0 0.0 0 0 1 ▇▁▁▁▁
native-country_Yugoslavia 0 1 0.00 0.02 0 0.0 0 0 1 ▇▁▁▁▁
# Specify a recipe for pca
pca_rec <- recipe(~ ., data = income) %>% 
  update_role(income_above_50k, new_role = "ID") %>% 
  step_normalize(all_predictors()) %>% 
  step_pca(all_predictors(), num_comp = 2, id = "pca")

# Print out recipe
pca_rec
# Estimate required statistcs 
pca_estimates <- prep(pca_rec)

# Return preprocessed data using bake
features_2d <- pca_estimates %>% 
  bake(new_data = NULL)

# Print baked data set
features_2d %>% 
  slice_head(n = 5)
## # A tibble: 5 × 3
##   income_above_50k   PC1   PC2
##   <lgl>            <dbl> <dbl>
## 1 FALSE            -2.31  2.63
## 2 FALSE             2.21  1.54
## 3 TRUE              2.29 -1.03
## 4 TRUE              1.05  1.30
## 5 FALSE            -1.79  1.79
# Examine how much variance each PC accounts for
pca_estimates %>% 
  tidy(id = "pca", type = "variance") %>% 
  filter(str_detect(terms, "percent"))
## # A tibble: 208 × 4
##    terms            value component id   
##    <chr>            <dbl>     <int> <chr>
##  1 percent variance  4.38         1 pca  
##  2 percent variance  2.98         2 pca  
##  3 percent variance  2.51         3 pca  
##  4 percent variance  2.26         4 pca  
##  5 percent variance  1.87         5 pca  
##  6 percent variance  1.71         6 pca  
##  7 percent variance  1.64         7 pca  
##  8 percent variance  1.52         8 pca  
##  9 percent variance  1.38         9 pca  
## 10 percent variance  1.34        10 pca  
## # … with 198 more rows
theme_set(theme_light())
# Plot how much variance each PC accounts for
pca_estimates %>% 
  tidy(id = "pca", type = "variance") %>% 
  filter(terms == "percent variance") %>% 
  ggplot(mapping = aes(x = component, y = value)) +
  geom_col(fill = "midnightblue", alpha = 0.7) +
  ylab("% of total variance")

# Visualize PC scores
features_2d %>% 
  ggplot(mapping = aes(x = PC1, y = PC2)) +
  geom_point(size = 2, color = "dodgerblue3")+
  geom_jitter(alpha= 0.1)

10 pts k-means

capital-gain fnlwgt capital-loss

# Drop target column and normalize data
income_features<- recipe(~ ., data = income) %>%
 step_rm(income_above_50k) %>% 
  step_naomit(everything(), skip = TRUE) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>% #converts our factor columns into numeric binary (0 and 1) variables.
  step_zv(all_numeric(), -all_outcomes()) %>% ## step_zv(): removes any numeric variables that have zero variance.


  prep() %>% 
  bake(new_data = NULL)

# Print out data
income_features %>% 
  slice_head(n = 5)
## # A tibble: 5 × 104
##     age fnlwgt educati…¹ capit…² capit…³ hours…⁴ workc…⁵ workc…⁶ workc…⁷ workc…⁸
##   <dbl>  <dbl>     <dbl>   <dbl>   <dbl>   <dbl>   <int>   <int>   <int>   <int>
## 1    25 226802         7       0       0      40       0       0       1       0
## 2    38  89814         9       0       0      50       0       0       1       0
## 3    28 336951        12       0       0      40       0       1       0       0
## 4    44 160323        10    7688       0      40       0       0       1       0
## 5    34 198693         6       0       0      30       0       0       1       0
## # … with 94 more variables: `workclass_Self-emp-not-inc` <int>,
## #   `workclass_State-gov` <int>, `workclass_Without-pay` <int>,
## #   `education_1st-4th` <int>, `education_5th-6th` <int>,
## #   `education_7th-8th` <int>, education_9th <int>, education_10th <int>,
## #   education_11th <int>, education_12th <int>, `education_Assoc-acdm` <int>,
## #   `education_Assoc-voc` <int>, education_Bachelors <int>,
## #   education_Doctorate <int>, `education_HS-grad` <int>, …

One way we can try to find out is to use a data sample to create a series of clustering models with an incrementing number of clusters, and measure how tightly the data points are grouped within each cluster. A metric often used to measure this tightness is the within cluster sum of squares (WCSS), with lower values meaning that the data points are closer. You can then plot the WCSS for each model.

We’ll use the built-in kmeans() function, which accepts a data frame with all numeric columns as it’s primary argument to perform clustering - means we’ll have to drop the species column. For clustering, it is recommended that the data have the same scale. We can use the recipes package to perform these transformations.

set.seed(503)

# Create 10 models with 1 to 10 clusters
kclusts <- tibble(k = 1:10) %>% 
  mutate(
    model = map(k, ~ kmeans(x = income_features, centers = .x, nstart = 20)),
    glanced = map(model, glance)) %>% 
  unnest(cols = c(glanced))

# View results
kclusts
## # A tibble: 10 × 6
##        k model      totss tot.withinss betweenss  iter
##    <int> <list>     <dbl>        <dbl>     <dbl> <int>
##  1     1 <kmeans> 5.07e14      5.07e14   4.63e 3     1
##  2     2 <kmeans> 5.07e14      2.10e14   2.97e14     1
##  3     3 <kmeans> 5.07e14      1.14e14   3.93e14     2
##  4     4 <kmeans> 5.07e14      7.53e13   4.32e14     2
##  5     5 <kmeans> 5.07e14      5.40e13   4.53e14     2
##  6     6 <kmeans> 5.07e14      3.90e13   4.68e14     3
##  7     7 <kmeans> 5.07e14      2.97e13   4.78e14     4
##  8     8 <kmeans> 5.07e14      2.40e13   4.83e14     4
##  9     9 <kmeans> 5.07e14      1.95e13   4.88e14     3
## 10    10 <kmeans> 5.07e14      1.62e13   4.91e14     5
# Plot Total within-cluster sum of squares (tot.withinss)
kclusts %>% 
  ggplot(mapping = aes(x = k, y = tot.withinss)) +
  geom_line(size = 1.2, alpha = 0.5, color = "dodgerblue3") +
  geom_point(size = 2, color = "dodgerblue3")

set.seed(503)
# Fit and predict clusters with k = 3
final_kmeans <- kmeans(income_features, centers =2, nstart = 100, iter.max = 1000)

# Add cluster prediction to the data set
results <- augment(final_kmeans, income_features)   %>% 
# Bind pca_data - features_2d
  bind_cols(features_2d)

results %>% 
  slice_head(n = 5)
## # A tibble: 5 × 108
##     age fnlwgt educati…¹ capit…² capit…³ hours…⁴ workc…⁵ workc…⁶ workc…⁷ workc…⁸
##   <dbl>  <dbl>     <dbl>   <dbl>   <dbl>   <dbl>   <int>   <int>   <int>   <int>
## 1    25 226802         7       0       0      40       0       0       1       0
## 2    38  89814         9       0       0      50       0       0       1       0
## 3    28 336951        12       0       0      40       0       1       0       0
## 4    44 160323        10    7688       0      40       0       0       1       0
## 5    34 198693         6       0       0      30       0       0       1       0
## # … with 98 more variables: `workclass_Self-emp-not-inc` <int>,
## #   `workclass_State-gov` <int>, `workclass_Without-pay` <int>,
## #   `education_1st-4th` <int>, `education_5th-6th` <int>,
## #   `education_7th-8th` <int>, education_9th <int>, education_10th <int>,
## #   education_11th <int>, education_12th <int>, `education_Assoc-acdm` <int>,
## #   `education_Assoc-voc` <int>, education_Bachelors <int>,
## #   education_Doctorate <int>, `education_HS-grad` <int>, …
# Plot km_cluster assignmnet on the PC data
cluster_plot <- results %>% 
    ggplot(mapping = aes(x = PC1, y = PC2)) +

  geom_point(aes(shape = .cluster), size = 2) +
  scale_color_manual(values = c("darkorange","purple","cyan4"))

# Make plot interactive
ggplotly(cluster_plot)
clust_spc_plot <- results %>% 
  ggplot(mapping = aes(x = PC1, y = PC2)) +
  geom_point(aes(shape = .cluster, color = income_above_50k), size = 2, alpha = 0.8) +
  scale_color_manual(values = c("darkorange","purple","cyan4"))

# Make plot interactive
ggplotly(clust_spc_plot)

10 pts supervised learning